 /* 
PROGRAM BY:  JEFFREY WANG
Last Edited: 11/25/2019
Last edit Feb 26 2020 by xiang. pulling in additional AUX variables from 1977-1992:
frdt frad fwhs fedp fsal fapr
Last edit Feb 29 2020 by xiang. pulling in additional CMF vars: ee cf cr cw td
** NOTE: TR in CCN and TR in CMF mean different things. rename TR in CMF to trcmf

NOTE: This program is based on programs by Xiang. Adds years 1977, 1982, and 1987 where available, so covers all censuses from 1977-2012

NOTE: CMI is still missing cfn from 1977 to 1997, so only using CMI from 2002 to 2012 when survu_id is available
        
note: also updates LBD vers to 2016

Uses new economic data library paths

This program pulls in data from the economic censuses to make the main dataset
1977-2012
Includes AUX establishments (important pre-1997; post-1997 they are in the CSR)

The first macro brings in the sales, emp, va, etc data by industry from the Censuses
The second macro brings in employment data from the LBD

This program makes a dataset for each census year with sales, emp & industry, and with lbdnum and lbd emp

This program uses the latest version of fknaics.

To-Do:
- check and create dictionary cheatshgeets for other (non-CMF) censuses. for now using old variables list
*/


/*
Extensive edits by TF on 3/12/20

Edits made:
  1) 	No longer merge to the LBD and FK NAICS
  2) 	Keep duplicate records across ECs to preserve collected information as possible
  3) 	Tried to make records with all information.  NOTE: when different ECs have the same var names, need to correct for that
  4) 	Only sales and emp done carefully now.  
  5) 	When using information from duplicate records, need to check those variables across ECs carefully

Dupe Selection:
  1) 	Prior to 2002 there is no tabbed variable.
  2) 	Prior to 2002, if emp>0 use the aux record.  If not that one, select based on highest emp
  3)	Post 2002, use tabbed.  Otherwise use highest emp variable

*/

options obs=max ;
* options obs=100000 ;

%include 'yyyy/pdata.sas';
    



libname ecroot 'xxxx/data';
%let ecroot= xxxx/data ;
%let lbdvers= _c201600 ; /* used for lbd and for fknaics */


* MACRO 1: BRING IN SALES DATA FROM THE CENSUSES;
************************************************************************************;

* Make firm sales and employment;

**Get firm sales from the censuses;
***************************************************************;
**Define yearly lists;
    
*CCN: census of contruction;
*starts in 1972 but only have access starting in 1977;
%let ccn_1977=ate cfn einum sic tr va cm pscomm tce nm nt ns psrm psrs pstotl um us;
%let ccn_1982=act ate cfn einum sic tr va cm pscomm tce nm nt ns psrm psrs pstotl um us;
%let ccn_1987=act ate cfn einum sic tr va cm pscomm tce nm nt ns psrm psrs pstotl um us;
%let ccn_1992=act ate cfn einum sic tr va cm pscomm tce nm nt ns psrm psrs pstotl um us; /*sic char4, missing alpha*/
%let ccn_1997=ate cfn einum naics tr va cm sic4 pscomm tce psrm psrs pstotl; /*sic4 char4*/
%let ccn_2002=actvstat alpha ein empavt naicold naicnew rcptot survu_id tabstat valaddc cstmprt pchlgls pchacts pchcsvc pchadvt cextot pchctot rcpecomt;
%let ccn_2007=alpha ein empavg naics_old naics_new sales_t survu_id tabbed valaddc cprte cpc ca ccmpq cexso cdapr tce crep crm ctax tps coth naics_aux_new;
%let ccn_2012=alpha ein empavg naicnew naicold rcptot survu_id tabstat valaddc cstmprt pchprte pchcsvc pchadvt pchcmpq pchexso pchdapr cextot pchrpr pchrfus pchtax pchtt pchoexp; 
*ccn2012 is titled ccn.ccn2012_final, 139k obs, roughly similar to the 145k obs in 2007;
*cp is cost of materials and parts, cstmprt is the same;
*cstcmt is selected costs, total, we dont use this;
*edited to include cstmpri for ccn2002;

*CFI: census of finance, insurance, and real estate;
*begins in 1992;
    
%let cfi_1992=alpha cfn ein empl sales sic tabbed; /*sic char6*/
%let cfi_1997=alpha cfn ein empl naics_new sales sic tabbed; /*sic char6*/
%let cfi_2002=active alpha ein empl naics_new naics_old sales survu_id tabbed;
%let cfi_2007=active alpha ein empl naics_old naics_new sales sales_t survu_id tabbed;
%let cfi_2012=active alpha ein empl naics_new naics_old sales sales_t survu_id tabbed;

*CMF: census of manufactures;
*begins in 1977;
%let cmf_1977=bridge cfn ei firmid ind te tvs nind cm cp ipt fcm: cls cba cpc ca cs rm rbs crm ue um ub nm: cex nb tme tce;
%let cmf_1982=bridge cfn ei firmid ind te tvs nind cm cp ipt fcm: cls cba cpc ca cs rm rbs crm ue um ub nm: cex nb tme tce;
%let cmf_1987=bridge cfn ei firmid ind te tvs nind cm cp ipt fcm: cls cba cpc ca cs rm rbs crm ue um ub nm: cex nb tme tce;
%let cmf_1992=bridge cfn ei firmid ind te tvs nind cm cp ipt fcm: cls cba cpc ca cs rm rbs crm ue um ub nm: cex nb tme tce;
%let cmf_1997=bridge cfn ei firmid ind te tvs nind cm cp exp ipt fcm: cls cba cpc ca cs rm rbs crm ue um ub nm: nb tme tce;
%let cmf_2002=alpha active survu_id ein firmid naics_new te tvs tabbed cm cp survu_type ecom: cmec ipt exp cmea cmec cbe cme cmeo ccomp cls cba cman cpc ca crep crm ctax coth tps tme tce naics_aux_a form_mailed mailflg;
%let cmf_2007=alpha survu_id ein firmid naics_new te tvs tabbed cm cp survu_type ecom: ipt exp cdapr cexso cmea cmec cbe cme cmeo ctemp cprte cpc ca crep crm ctax coth tps tme tce naics_aux_a form_mailed mailflg;
%let cmf_2012=alpha survu_id ein firmid naics_new te tvs tabbed cm cp survu_type ecom: ipt exp cdapr cexso cmea cmec cbe cme cmeo ctemp cprte cpc ca crep crm ctax coth tps tme tce naics_aux_a form_mailed mailflg;

%let cmf_all= ar exp firmid oe ow pw sw tab tae tce tr te tme tvs va ww ee cf cr cw td;

        
*Census of Mining: Note that there are also production and non-production workers here: ww;
*starts in 1987;
*cfn variable is still screwed up: only an id variable available in 1987 and 1992;
*1997: only cfn6;
%let cmi_1987=id ein nind oind te vs cm va vr ww cpc cex cbe cld cme nb ne nm;
%let cmi_1992=id ein cm ind te tvps tvs va cm cpc cex cbe cld cme nb ne nm; /*ind char4*/
%let cmi_1997=cfn6 ein nind6 te tvps va cm oind cls cba cpc ca cs cex crm; /* oind char4 */
%let cmi_2002=actvstat alpha ein empq1 naicold naicnew rcptot survu_id tabstat valaddm cstmtot pchlgls pchacts pchcsvc pchadvt cextot pchtt pchoexp cexnd cexnl cexnt;
%let cmi_2007=ein alpha naics_old naics_new tvps survu_id tabbed te tvs va cm cprte cpc ca ccmpq cexso cdapr tce crep crm ctax tps coth cexnd cexnl cexnt naics_aux_new;
%let cmi_2012=alpha ein empq1 naicold naicnew rcptot survu_id tabstat valaddm cstmtot pchprte pchcsvc pchadvt pchcmpq pchexso pchdapr cextot pchrpr pchrfus pchtax pchtt pchoexp cexnd cexnl cexnt;

*CWH;
%let cwh_1977=alpha cfn ein empl firmid sales sic;
%let cwh_1982=alpha cfn ein empl firmid sales sic;
%let cwh_1987=alpha cfn ein empl firmid sales sic tabbed;
%let cwh_1992=alpha cfn ein empl firmid sales sic tabbed;
%let cwh_1997=alpha cfn ein empl naics_new sic sales tabbed; /*sic char 6*/
%let cwh_2002=active alpha ein empl naics_new naics_old sales survu_id tabbed;
%let cwh_2007=active alpha ein empl naics_old naics_new sales survu_id tabbed;
%let cwh_2012=active alpha ein empl naics_old naics_new sales survu_id tabbed;


*Remaining Censuses: share the same variable list in each year;
%let rem_1977=alpha cfn ein empl firmid sales sic;
%let rem_1982=alpha cfn ein empl firmid sales sic;
%let rem_1987=alpha cfn ein empl firmid sales sic tabbed;
%let rem_1992=alpha cfn ein empl firmid sales sic tabbed;
%let rem_1997=alpha cfn ein empl naics_new sic sales tabbed; /*sic char 6*/
%let rem_2002=active alpha ein empl naics_new naics_old sales survu_id tabbed naics_aux_new;
%let rem_2007=active alpha ein empl naics_old naics_new sales survu_id tabbed naics_aux_new;
%let rem_2012=active alpha ein empl naics_old naics_new sales survu_id tabbed naics_aux_new;

*Census of Aux; /* only needed for 97 and before, after which exists in other censuses, update pulling for all years now */
%let aux_1977 = cfn ein fcsic femp fapr fsls alpha fcom fcet fcec fcea fceom fceb fbr fmr fced fceu fnce frdt fwhs fedp fsal frad
		faux fcao fone fto
;
%let aux_1982 = cfn ein fcsic femp fapr fsls alpha fcom fcet fcec fcemt fcea fceom fceb fbr fmr fceu fnce frdt fwhs fedp fsal frad
		fall faux fcao fhqr fpact fto
;
%let aux_1987 = act cfn ein f7sic femp87 fapr87 fsls87 alpha fcom fadv fcet fcec fcemt fcea fceom fceb fbr fmr fced fceu fceom frdt fwhs fedp fsal frad
		faux fmto fpact fto
;    
%let aux_1992 = act cfn ein f92sic femp92 fapr92 fsls92 fleg facc fcom fadv fdp fcet fcec fcemt fcea fceom fceb fbr fmr fced fceu fceom frdt fwhs fedp fsal frad
		faux fmto fpact fto
;
%let aux_1997 = cfn ein naics sic c032 c010 c070 ;





**Lists for expenses and costs, etc., for CMF and CWH;
%let costs_cwh_2012=gross_margin gross_profit merch merch_r opexp opexp_r state_fips sales_t optype ecom: ;
%let costs_cwh_2007=gross_margin gross_profit merch merch_r opexp opexp_r state_fips sales_t optype ecom: ;
%let costs_cwh_2002=gross_margin gross_profit merch merch_r opexp opexp_r state_fips sales_t optype ecom: ;
%let costs_cwh_1997=gross_margin gross_profit merch merch_r opexp opexp_r state_fips optype;
%let costs_cwh_1992=gross_margin gross_profit merch merch_r opexp opexp_r state_fips optype;
%let costs_cwh_1987=opexp opexp_r state_fips optype;
%let costs_cwh_1982=opexp opexp_r state_fips optype;
%let costs_cwh_1977=opexp opexp_r state_fips optype;

%macro firm_census(y);
*Pull in 1977-1997 census data;
%if &y<2001 %then %do;
    
*pull in 1977 data;
%if &y=1977 %then %do;	
data census&y;
  set data_008.ccn&y.     (keep=&&ccn_&y..     in=a)
      /* nothing for cfi in 1977 */
      data_014.cmf&y.     (keep=&&cmf_&y..  &cmf_all. in=c rename=(firmid=firmidcmf tr=trcmf va=va_cmf)) 
      /* nothing for cmi in 1977 */
      data_017.crt&y.base(keep=&&rem_&y..     in=e rename=(sic=sic6))
      data_018.csr&y.base(keep=&&rem_&y..     in=f rename=(ein=ein_t sic=sic6))
      /* nothing for cut in 1977 */ 
      data_020.cwh&y.base(keep=&&cwh_&y.. &&costs_cwh_&y..   in=h rename=(sic=sic6));

    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
 
    /* no tabbed variable in 1977 */
run;
%end;
    
*pull in 1982 data;
%if &y=1982 %then %do;	
data census&y;
  set data_008.ccn&y.     (keep=&&ccn_&y..     in=a)
      /* nothing for cfi in 1982 */
      data_014.cmf&y.     (keep=&&cmf_&y. &cmf_all. in=c rename=(firmid=firmidcmf tr=trcmf va=va_cmf)) 
      /* nothing for cmi in 1982 */
      data_017.crt&y.base(keep=&&rem_&y..     in=e rename=(sic=sic6))
      data_018.csr&y.base(keep=&&rem_&y..     in=f rename=(ein=ein_t sic=sic6))
     /* nothing for cut in 1982 */ 
      data_020.cwh&y.base(keep=&&cwh_&y.. &&costs_cwh_&y..   in=h rename=(sic=sic6));
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;

    /* no tabbed variable in 1982 */
run;
%end;
    
*pull in 1987 data;
%if &y=1987 %then %do;	
data census&y;
  set data_008.ccn&y.     (keep=&&ccn_&y..     in=a)
      /* nothing for cfi in 1987 */
      data_014.cmf&y.     (keep=&&cmf_&y. &cmf_all. in=c rename=(firmid=firmidcmf tr=trcmf va=va_cmf)) 
       /* data_015.cmi&y     (keep=&&cmi_&y     in=d rename=(id=cfn nind=sic vs=tvps)) */
      data_017.crt&y.base(keep=&&rem_&y..     in=e rename=(sic=sic6))
      data_018.csr&y.base(keep=&&rem_&y..     in=f rename=(ein=ein_t sic=sic6))
      data_019.cut&y.base(keep=&&rem_&y..     in=g rename=(sic=sic6)) 
      data_020.cwh&y.base(keep=&&cwh_&y.. &&costs_cwh_&y..   in=h rename=(sic=sic6));
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;

        /* tabbed variable available for some but not others;
            just clean out those with tabbed = 'N' */
        * if tabbed~='N';  * keep all records to collect all info;
run;
%end;
    
*pull in 1992 data;
%if &y=1992 %then %do;	
data census&y;
  set data_008.ccn&y     (keep=&&ccn_&y..     in=a)
      data_009.cfi&y.base(keep=&&cfi_&y..     in=b rename=(sic=sic6))
      data_014.cmf&y     (keep=&&cmf_&y. &cmf_all. in=c rename=(firmid=firmidcmf tr=trcmf va=va_cmf)) 
      /* data_015.cmi&y     (keep=&&cmi_&y..     in=d rename=(id=cfn ind=sic)) */
      data_017.crt&y.base(keep=&&rem_&y..     in=e rename=(sic=sic6))
      data_018.csr&y.base(keep=&&rem_&y..     in=f rename=(ein=ein_t sic=sic6))
      data_019.cut&y.base(keep=&&rem_&y..     in=g rename=(sic=sic6)) 
      data_020.cwh&y.base(keep=&&cwh_&y.. &&costs_cwh_&y..     in=h rename=(sic=sic6));
    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
   * if tabbed~='N';
run;
%end;

*pull in 1997 data;
%if &y=1997 %then %do;
data census&y ;
  set data_008.ccn&y     (keep=&&ccn_&y..     in=a rename=(sic4=sic))
      data_009.cfi&y.base(keep=&&cfi_&y..     in=b rename=(sic=sic6))
      data_014.cmf&y     (keep=&&cmf_&y. &cmf_all. in=c rename=(firmid=firmidcmf tr=trcmf va=va_cmf) )
     /* data_015.cmi&y     (keep=&&cmi_&y..     in=d rename=(oind=sic)) */
      data_017.crt&y.base(keep=&&rem_&y..     in=e rename=(sic=sic6))
      data_018.csr&y.base(keep=&&rem_&y..     in=f rename=(ein=ein_t sic=sic6))
      data_019.cut&y.base(keep=&&rem_&y..     in=g rename=(sic=sic6)) 
      data_020.cwh&y.base(keep=&&cwh_&y.. &&costs_cwh_&y..   in=h rename=(sic=sic6));

    *Create variable to keep track of what file the record came from ;
       if a=1        then flag_a=1 ;
         else if b=1 then flag_b=1 ;
         else if c=1 then flag_c=1 ;
         else if d=1 then flag_d=1 ;
         else if e=1 then flag_e=1 ;
         else if f=1 then flag_f=1 ;
         else if g=1 then flag_g=1 ;
         else if h=1 then flag_h=1 ;
  * if tabbed~='N';
run;
%end;
	
 *Fix all variables;
 data census&y (drop=temp: einum ein_t tr tvps sic ate empl sic6 sic nind ind oind naics naics_new nind6 cfn6);
   length firmid $ 10 temp $ 8 temp2 $ 6 naics97_census $ 6  sic_census $ 4 naics02_census $ 6 naics07_census $ 6 naics12_census $ 6;
   set census&y;	    
   
  *Make correct ein variable;
     if flag_f=1 then ein=left(trim(ein_t));
     if flag_a=1 then ein=einum;

   *Make firmid variable and alpha;
   if substr(cfn,1,1)='0' then firmid=cfn;
   if substr(cfn,1,1)~='0' then firmid=substr(cfn,1,6)||'0000';
   *fix alpha for ccn, cmf, cmi, aux ;
    if flag_a=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6);
    if flag_c=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6);
    if flag_d=1 & substr(cfn,1,1)~='0' then alpha=substr(firmid,1,6);

  *Makes sales variable;
    if flag_a=1 then sales=tr;
    if flag_d=1 then sales=tvps;
    if flag_c=1 then sales=tvs;

    if flag_a=1 then sales_ccn=tr;
    if flag_b=1 then sales_cfi=sales ;
    if flag_c=1 then sales_cmf=tvs;
    if flag_d=1 then sales_cmi=tvps;
    if flag_e=1 then sales_crt=sales ;
    if flag_f=1 then sales_csr=sales ;
    if flag_g=1 then sales_cut=sales ;
    if flag_h=1 then sales_cwh=sales ;


  *Make census employment variable;
   if flag_a=1 then emp_cen=ate;
   if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1 then emp_cen=empl;
   if flag_c=1 | flag_d=1 then emp_cen=te;

   if flag_a=1 then emp_ccn=ate;
   if flag_b=1 then emp_cfi=empl;
   if flag_c=1 then emp_cmf=te;
   if flag_d=1 then emp_cmi=te;
   if flag_e=1 then emp_crt=empl;
   if flag_f=1 then emp_csr=empl;
   if flag_g=1 then emp_cut=empl;
   if flag_h=1 then emp_cwh=empl;


/* 1977: flag_a,d has sic, flag_b has sic6, flag_c has ind, flag_e,f,g,h,x have sic6. */
  %if &y=1977 %then %do;
      temp=ind;
      temp2=left(trim(temp));
      if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1 then sic_census=substr(sic6,1,4);
      if flag_c=1 then sic_census=substr(temp2,1,4);
      if flag_a=1 | flag_d=1 then sic_census = sic;
      naics97_census=' ';
      naics02_census=' ';
      naics07_census=' ';
      naics12_census=' ';
      %end;

/* 1982: flag_a,d has sic, flag_b has sic6, flag_c has ind, flag_e,f,g,h,x have sic6. */
  %if &y=1982 %then %do;
      temp=ind;
      temp2=left(trim(temp));
      if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1 then sic_census=substr(sic6,1,4);
      if flag_c=1 then sic_census=substr(temp2,1,4);
      if flag_a=1 | flag_d=1 then sic_census = sic;
      naics97_census=' ';
      naics02_census=' ';
      naics07_census=' ';
      naics12_census=' ';
      %end;

/* 1987: flag_a,d has sic, flag_b has sic6, flag_c has ind, flag_e,f,g,h,x have sic6. */
  %if &y=1987 %then %do;
      temp=ind;
      temp2=left(trim(temp));
      if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1 then sic_census=substr(sic6,1,4);
      if flag_c=1 then sic_census=substr(temp2,1,4);
      if flag_a=1 | flag_d=1 then sic_census = sic;
      naics97_census=' ';
      naics02_census=' ';
      naics07_census=' ';
      naics12_census=' ';
      %end;

/* 1992: flag_a,d has sic, flag_b has sic6, flag_c has ind, flag_e,f,g,h,x have sic6. */
/* resulting dataset: sic (flag a, d), sic_census(flag b,c,e,f,g,h), sic6 (flag b,e,f,g,h)*/
  *Make industry variable;
  %if &y=1992 %then %do;
      temp=ind;
      temp2=left(trim(temp));
      if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1  then sic_census=substr(sic6,1,4);
      if flag_c=1 then sic_census=substr(temp2,1,4);
      if flag_a=1 | flag_d=1 then sic_census = sic;
      naics97_census=' ';
      naics02_census=' ';
      naics07_census=' ';
      naics12_census=' ';
      %end;

  %if &y=1997 %then %do;
      temp=ind; /* cmf is the only census with 83 recs missing naics97, so use sic code as well*/
      temp2=left(trim(temp));
      naics97_census=substr(naics_new,1,6); /*picks up flag_b, e-h, x */
      if flag_c=1 then naics97_census=nind;
      if flag_a=1 then naics97_census=naics;
      if flag_d=1 then naics97_census=nind6;
      sic_census=' ';
      if flag_b=1 | flag_e=1 | flag_f=1 | flag_g=1 | flag_h=1 then sic_census=substr(sic6,1,4);
      if flag_c=1 then sic_census=substr(temp2,1,4);
      if flag_a=1 | flag_d=1 then sic_census = sic;
      naics02_census=' ';
      naics07_census=' ';
      naics12_census=' ';
      %end;
  *Make source variable;
       if flag_a=1        then source='ccn' ;
         else if flag_b=1 then source='cfi' ;
         else if flag_c=1 then source='cmf' ;
         else if flag_d=1 then source='cmi' ;
         else if flag_e=1 then source='crt' ;
         else if flag_f=1 then source='csr' ;
         else if flag_g=1 then source='cut' ;
         else if flag_h=1 then source='cwh' ;

run;
%end;


***Pull in 2002 Census data;
%if &y=2002 %then %do;
data census&y(drop=temp: surv: ein_num ein_l empavt empq1 empl rcptot sales_t naics_old naics_new naicnew naicold);
    length firmid $ 10 temp2 $ 10 temp3 $ 11 naics97_census $ 6 naics02_census $ 6 sic_census $ 4 naics07_census $ 6 naics12_census $ 6 ;
  set data_008.ccn&y     (keep=&&ccn_&y     in=a rename=(ein=ein_num survu_id=surv_num cstmprt=cm_ccm valaddc=va_ccn))
      data_009.cfi&y.base(keep=&&cfi_&y     in=b)
      data_014.cmf&y.     (keep=&&cmf_&y. &cmf_all.  in=c rename=(ein=ein_num firmid=firmidcmf survu_type=estabstype tr=trcmf va=va_cmf))
      data_015.cmi&y.     (keep=&&cmi_&y     in=d rename=(ein=ein_num survu_id=surv_num valaddm=va_cmi cstmtot=cm_cmi))
      data_017.crt&y.base(keep=&&rem_&y     in=e)
      data_018.csr&y.base(keep=&&rem_&y     in=f rename=(ein=ein_l))
      data_019.cut&y.base(keep=&&rem_&y     in=g)
      data_020.cwh&y.base(keep=&&cwh_&y &&costs_cwh_&y..    in=h);
   *Make correct ein variable;
     if ein_num~=. then do;
	 ein=put(ein_num,z9.);
     end;
    if f=1 then ein=left(trim(ein_l)); /*now all variables back to ein*/
   *Make cfn variable;
    if a=1 | d=1 then do;
      temp2=surv_num;
      cfn=temp2;
      end;
    if b=1 | c=1 | e=1 | f=1 | g=1 | h=1 then cfn=survu_id;  
  *Make firmid variable;
    if alpha~=' ' then firmid=alpha||"0000";
    if alpha=' ' then firmid='0'||ein;
   
  *Make activity code flag;
    if a=1 | d=1 then active=actvstat;
  
  *Make tabbed flag;
    if a=1 | d=1 then tabbed=tabstat;

  *Make employment variable;
   if a=1 then emp_cen=empavt;
   if b=1 | e=1 | f=1 | g=1 | h=1 then emp_cen=empl;
   if c=1 then emp_cen=te;
   if d=1 then emp_cen=empq1;

   if a=1 then emp_ccn=empavt;
   if b=1 then emp_cfi=empl;
   if c=1 then emp_cmf=te;
   if d=1 then emp_cmi=empq1;
   if e=1 then emp_crt=empl;
   if f=1 then emp_csr=empl;
   if g=1 then emp_cut=empl;
   if h=1 then emp_cwh=empl;

   *Create variable to keep track of what file the record came from ;
       if a=1        then source='ccn' ;
         else if b=1 then source='cfi' ;
         else if c=1 then source='cmf' ;
         else if d=1 then source='cmi' ;
         else if e=1 then source='crt' ;
         else if f=1 then source='csr' ;
         else if g=1 then source='cut' ;
         else if h=1 then source='cwh' ;
   
  *Make sales variable;
   if a=1 | d=1 then sales=rcptot;
   if c=1 then sales=tvs;

    if a=1 then sales_ccn=rcptot;
    if b=1 then sales_cfi=sales ;
    if c=1 then sales_cmf=tvs;
    if d=1 then sales_cmi=rcptot;
    if e=1 then sales_crt=sales ;
    if f=1 then sales_csr=sales ;
    if g=1 then sales_cut=sales ;
    if h=1 then sales_cwh=sales ;
   

   *Make industry variables;
   naics02_census=substr(naics_new,1,6);
   if a=1 | d=1 then naics02_census=substr(naicnew,1,6);
   naics97_census=substr(naics_old,1,6);
   if a=1 | d=1 then naics97_census=substr(naicold,1,6);
   sic_census=' ';
   naics07_census=' ';
   naics12_census=' ' ;
*if tabbed~='N' ;  
/*if tabbed~='N' & active~='N';*/
run;
%end;

***Pull in 2007 Census data;
%if &y=2007 %then %do;
data census&y(drop=ein_t temp: surv: ein_l naics_old sales_t naics_new empl empavg);
    length firmid $ 10 temp2 $10  naics97_census $ 6 naics02_census $ 6 sic_census $ 4 naics07_census $ 6 naics12_census $ 6 ;
    set data_008.ccn&y       (keep=&&ccn_&y     in=a rename=(survu_id=temp1 ein=ein_t valaddc=va_ccn) )
      data_009.cfi&y.base    (keep=&&cfi_&y     in=b)
      data_014.cmf&y.         (keep=&&cmf_&y. &cmf_all   in=c rename=(ein=ein_t firmid=firmidcmf survu_type=estabstype tr=trcmf va=va_cmf))
      data_015.cmi&y         (keep=&&cmi_&y     in=d rename=(survu_id=temp1 ein=ein_t))
      data_017.crt&y.base    (keep=&&rem_&y     in=e)
      data_018.csr&y.base    (keep=&&rem_&y     in=f rename=(ein=ein_l))
      data_019.cut&y.base    (keep=&&rem_&y     in=g) 
      data_020.cwh&y.base    (keep=&&cwh_&y &&costs_cwh_&y..   in=h);
   
 *Make fixed ein variable;
    if ein_t~=. then do;
	 ein=put(ein_t,z9.);
     end;
     if f=1 then ein=left(trim(ein_l));
			  
   *Make cfn variable;
   if a=1 | d=1 then do;
       temp2=temp1;
       cfn=temp2;
       end;
    if b=1 | c=1| e=1 | f=1 | g=1 | h=1 then cfn=survu_id; 
   
  *Make firmid variable;
    if alpha~=' ' then firmid=alpha||"0000";
    if alpha=' ' then firmid='0'||ein;
   
  *Make sales variable;
    if a=1 then sales=sales_t;
    if a=1 | c=1 | d=1 then sales=tvs;

    if a=1 then sales_ccn=sales;
    if b=1 then sales_cfi=sales ;
    if c=1 then sales_cmf=sales;
    if d=1 then sales_cmi=sales;
    if e=1 then sales_crt=sales ;
    if f=1 then sales_csr=sales ;
    if g=1 then sales_cut=sales ;
    if h=1 then sales_cwh=sales ;


   *Make naics variables;
    naics02_census=substr(naics_old,1,6);
    naics07_census=substr(naics_new,1,6);
    sic_census=' ';
    naics97_census=' ';
    naics12_census=' ' ;

   *Make employment variable;
    if a=1 then emp_cen=empavg;
    if b=1 | e=1 | f=1 | g=1 | h=1 then emp_cen=empl;
    if c=1 | d=1 then emp_cen=te;

   if a=1 then emp_ccn=emp_cen;
   if b=1 then emp_cfi=emp_cen;
   if c=1 then emp_cmf=emp_cen;
   if d=1 then emp_cmi=emp_cen;
   if e=1 then emp_crt=emp_cen;
   if f=1 then emp_csr=emp_cen;
   if g=1 then emp_cut=emp_cen;
   if h=1 then emp_cwh=emp_cen;
 
   *Create variable to keep track of what file the record came from ;
       if a=1        then source='ccn' ;
         else if b=1 then source='cfi' ;
         else if c=1 then source='cmf' ;
         else if d=1 then source='cmi' ;
         else if e=1 then source='crt' ;
         else if f=1 then source='csr' ;
         else if g=1 then source='cut' ;
         else if h=1 then source='cwh' ;
*if tabbed~='N' ;
*if tabbed~='N' & active~='N';
run;
%end;

***Pull in 2012 Census data (No longer geo-data);
%if &y=2012 %then %do;
data census&y(drop=temp: surv: ein_t ein_l tabstat rcptot naicnew naicold naics_new naics_old empavg empl empq1);
    length firmid $ 10 temp2 $10 naics97_census $ 6 naics02_census $ 6 sic_census $ 4 naics07_census $ 6 naics12_census $ 6;
    set data_008.ccn&y._final       (keep=&&ccn_&y     in=a rename=(survu_id=temp1 ein=ein_t cstmprt=cm_cmi valaddc=va_ccn) )
      data_009.cfi&y.base     (keep=&&cfi_&y     in=b)
      data_014.cmf&y         (keep=&&cmf_&y. &cmf_all. in=c rename=(ein=ein_t firmid=firmidcmf survu_type=estabstype tr=trcmf va=va_cmf))
      data_015.cmi&y._final         (keep=&&cmi_&y     in=d rename=(survu_id=temp1 ein=ein_t valaddm=va_cmi cstmtot=cm_cmi))
      data_017.crt&y.base     (keep=&&rem_&y     in=e)
      data_018.csr&y.base     (keep=&&rem_&y     in=f rename=(ein=ein_l))
      data_019.cut&y.base     (keep=&&rem_&y     in=g) 
      data_020.cwh&y.base     (keep=&&cwh_&y &&costs_cwh_&y..   in=h);
   
 *Make fixed ein variable;
    if ein_t~=. then do;
	 ein=put(ein_t,z9.);
     end;
     if f=1 then ein=left(trim(ein_l));
 
   *Make cfn variable;
   if a=1 | d=1 then do;
       temp2=temp1;
       cfn=temp2;
       end;
    if b=1 | c=1 |  e=1 | f=1 | g=1 | h=1 then cfn=survu_id; 

   *Make firmid variable;
    if alpha~=' ' then firmid=alpha||"0000";
    if alpha=' ' then firmid='0'||ein;

   *Make tabbed flag;
    if a=1 | d=1 then tabbed=tabstat;

   *Make sales variable;
    if a=1 | d=1 then sales=rcptot;
    if c=1 then sales=tvs;

    if a=1 then sales_ccn=sales;
    if b=1 then sales_cfi=sales ;
    if c=1 then sales_cmf=sales;
    if d=1 then sales_cmi=sales;
    if e=1 then sales_crt=sales ;
    if f=1 then sales_csr=sales ;
    if g=1 then sales_cut=sales ;
    if h=1 then sales_cwh=sales ;

   *Make industry variables;
   naics12_census=substr(naics_new,1,6);
   if a=1 | d=1 then naics12_census=substr(naicnew,1,6);
   naics07_census=substr(naics_old,1,6);
   if a=1 | d=1 then naics07_census=substr(naicold,1,6);
   sic_census=' ';
   naics02_census=' ';
   naics97_census=' ';

   *Make employment variable;
    if a=1 then emp_cen=empavg;
    if b=1 | e=1 | f=1 | g=1 | h=1 then emp_cen=empl;
   if c=1 then emp_cen=te;
   if d=1 then emp_cen=empq1;

   if a=1 then emp_ccn=emp_cen;
   if b=1 then emp_cfi=emp_cen;
   if c=1 then emp_cmf=emp_cen;
   if d=1 then emp_cmi=emp_cen;
   if e=1 then emp_crt=emp_cen;
   if f=1 then emp_csr=emp_cen;
   if g=1 then emp_cut=emp_cen;
   if h=1 then emp_cwh=emp_cen;

   *Create variable to keep track of what file the record came from ;
       if a=1        then source='ccn' ;
         else if b=1 then source='cfi' ;
         else if c=1 then source='cmf' ;
         else if d=1 then source='cmi' ;
         else if e=1 then source='crt' ;
         else if f=1 then source='csr' ;
         else if g=1 then source='cut' ;
         else if h=1 then source='cwh' ;

* if tabbed~='N';

run;
%end;


****First check for cfn dupes within a year;
proc sort data=census&y;
    by cfn;
run;
    
data census&y._unique dupes&y;
    set census&y;
    by cfn;
    if first.cfn=1 & last.cfn=1 then output census&y._unique;
    else output dupes&y;
run;

proc sort data=dupes&y; by cfn emp_cen ; run;

%mend;


*Run the macro to pull census variables ;
****************************************;
%macro loop1();
  %do y=1977 %to 2012 %by 5;
     %firm_census(&y);
  %end;
%mend;
%loop1();
****************************************;

**DEAL WITH THE DUPES;


***Years differ in terms of activity and tabbed flags;
proc freq data=census1982; tables act; run; 
proc freq data=census1987; tables act; run; 
proc freq data=census1992; tables act; run;
proc freq data=census1997; tables tabbed; run;
proc freq data=census2002; tables tabbed; run;
proc freq data=census2007; tables tabbed; run;
proc freq data=census2012; tables tabbed; run;




%macro add_dupes();
   %do y=1977 %to 2012 %by 5;


%if &y<2001 %then %do;
   data dupes_add&y._1 dupes2_&y ;
	set dupes&y;
	by cfn emp_cen; 
	if emp_cen>0 & emp_cen~=. & source="aux" then dupe_priority=1; 
	if dupe_priority=. & last.cfn=1 then dupe_priority=2; 
	if dupe_priority=1 then output dupes_add&y._1 ;
	if dupe_priority=2 then output dupes2_&y ;
    run;

  *Take the 2s when needed;
  proc sort data=dupes_add&y._1 ; by cfn; run;
  proc sort data=dupes2_&y ; by cfn; run;

  data dupes_add&y._2 ;
	merge dupes_add&y._1(in=a) dupes2_&y ;
	by cfn; 
	if a=0 then output dupes_add&y._2;
  run;

  data dupes_add&y._3;
    set dupes_add&y._1 dupes_add&y._2;
  run;


  proc sort nodupkey data=dupes_add&y._3; by cfn; run;


%end;


    
%if &y>2001 %then %do;
    data dupes_add&y._0 dupes2_&y ;
	set dupes&y;
	by cfn emp_cen; 
	if tabbed='Y' then dupe_priority=1 ; 
	if dupe_priority=. & last.cfn=1 then dupe_priority=2; 
	if dupe_priority=1 then output dupes_add&y._0;
    run;

    proc sort data=dupes_add&y._0; by cfn emp_cen; run; 
    
    data dupes_add&y._1 dupes&y._still;
      set dupes_add&y._0;
      by cfn emp_cen ; 
      if last.cfn=1 then output dupes_add&y._1 ;
      else output dupes&y._still;
    run;

  *Now grab remainders;
  *Take the 2s when needed;
  proc sort data=dupes_add&y._1 ; by cfn; run;
  proc sort data=dupes2_&y ; by cfn; run;

  data dupes_add&y._2 ;
	merge dupes_add&y._1(in=a) dupes2_&y ;
	by cfn; 
	if a=0 then output dupes_add&y._2;
  run;

  data dupes_add&y._3;
    set dupes_add&y._1 dupes_add&y._2;
  run;


  proc sort nodupkey data=dupes_add&y._3; by cfn; run;


%end;


  proc sort data=dupes&y; by cfn; run;


*Add variables here as we care more about this;
*NOTE: NEED TO DO MORE CAREFUL RENAMING OF VARIABLES THAT HAVE THE SAME NAME ACROSS ECS;

%let dupe1977=ca cba cex cf cls cm cp cpc cr crm cs cw nm nmc nmnk ns nt exp 
	      ipt oe ow opexp pstotl pw rbs rm sw
	      tab tae tce td te tme trcmf ww va: ;

%let dupe1982=ca cba cex cf cls cm cp cpc cr crm cs cw nm nmc nmnk ns nt exp 
	      ipt oe ow opexp pstotl pw rbs rm sw
	      tab tae tce td te tme trcmf ww va: ;

%let dupe1987=ca cba cex cf cls cm cp cpc cr crm cs cw nm nmc nmnk ns nt exp 
	      ipt oe ow opexp pstotl pw rbs rm sw
	      tab tae tce td te tme trcmf ww va: ;


%let dupe1992=ca cba cex cf cls cm cp cpc cr crm cs cw nm nmc nmnk ns nt exp gross_margin
	      gross_profit ipt merch oe ow opexp pstotl pw rbs rm sw
	      tab tae tce td te tme trcmf ww va: ;

%let dupe1997=ca cba cf cls cm cp cpc cr crm cs cw nm nmc nmnk exp gross_margin
	      gross_profit ipt merch oe ow opexp pstotl pw rbs rm sw
	      tab tae tce td te tme trcmf ww va: ;

%let dupe2002=ca cbe cexnd cexnl cextot cf cm cme cmea cmec cmeo coth cp cpc cr crep crm ctax cw ecom_sales exp gross_margin
	      gross_profit ipt merch oe ow opexp pchadvt pchcsvc pchoexp  pchtt pw sw
	      tab tae tce td te tme tps trcmf ww va: ;

%let dupe2007=ca cbe cdapr cexnd cexnl cexso cf cm cme cmea cmec cmeo coth cp cpc cprte cr crep crm ctax ctemp cw ecom_sales exp gross_margin
	      gross_profit ipt merch oe ow opexp pw sw
	      tab tae tce td te tme tps trcmf ww va: ;

%let dupe2012=ca cbe cdapr cexnd cexnl cexso cextot cf cm cme cmea cmec cmeo coth cp cpc cprte cr crep crm ctax ctemp cw ecom_sales exp gross_margin
	      gross_profit ipt merch oe ow opexp pchadvt pchcmpq pchcsvc pchdapr pchexso pchoexp pchprte pchrfus pchtax pchtt pw sw
	      tab tae tce td te tme tps trcmf ww va: ;
	     

%let dupe_all= 	sales_ccn sales_cfi sales_cmf sales_cmi sales_crt sales_csr sales_cut sales_cwh 
		emp_ccn emp_cfi emp_cmf emp_cmi emp_crt emp_csr emp_cut emp_cwh 
		;

  proc means sum noprint data=dupes&y;
      by cfn; 
      vars &&dupe&y &dupe_all;
    output out=dupes_c&y sum()= ;
  run;

 %let main_list= cfn source sales emp_cen firmid sic_census naics97_census naics02_census naics07_census naics12_census;

  data dupes_add&y._4; 
    merge dupes_add&y._3(keep=&main_list in=a) dupes_c&y(in=b) ;
    by cfn; 
    if a=1 ;
  run;
  
%end; 

%mend ;
%add_dupes() ;



*Add selected dupes to the estab file;
****************************************;
%let drop_list=ei tvs b d flag_a flag_b flag_c flag_d flag_e flag_f     
	  flag_g flag_h flag_x g salx ;

%macro add_dupes();
   %do y=1977 %to 2012 %by 5;
      data all_estabs&y(drop=&drop_list);                                                                            
	  set census&y._unique dupes_add&y._4(in=a drop=_type_  rename=(_freq_=num_ec_dupes)) ;
	  if a=1 then dupe_flag=1;
	  year=&y;
      run;
   %end;
%mend ;
%add_dupes() ;
****************************************;

***MACRO 2: GET FKNAICS AND OTHER VARS FROM THE LBD;
************************************************************************************;
%macro firm_lbd(y);
    proc sort data=data_021.lbd&y.c&lbdvers. 
	    (keep=lbdnum cfn firmid emp naics sic bestnaics bestsic pay flag_i cbp state county zip
		rename=(sic=sic1 firmid=firmid_lbd naics=naics_lbd emp=emp_lbd pay=pay_lbd)) out=lbd;
	by lbdnum;
    run;

* get rid of cfn dupes- remove all improper CFNs - these have been checked beforehand;
    data lbd;
	set lbd;
	cfn_lbd=cfn;
	if length(cfn)=10;
    run;

  *Add the fk naics codes for ALL years;
    data fknaics;
	set data_021.naics&y.&lbdvers;
    run;
    
    proc sort data=fknaics;
	by lbdnum;
    run;
    
    data lbd;
	merge lbd(in=a) fknaics;
	by lbdnum;
	if a=1;
    run;

* get rid of cfn dupes within lbd using priority system;
    proc sort data=lbd;
	by cfn;
    run;
    
    data lbd&y lbdupes&y;
	set lbd;
	by cfn;
	if first.cfn=1 & last.cfn=1 then output lbd&y;
	else output lbdupes&y;
    run;
    
    data dupes_add;
        length fk_priority 3 impute_priority 3;
	set lbdupes&y;
	if fk_naics02~="" then fk_priority=1;
	else fk_priority=0;
	if flag_i='imputed record' then impute_priority = 0;
	else impute_priority=1;
    run;

    proc sort data=dupes_add; 
        by cfn fk_priority impute_priority emp_lbd pay_lbd; 
    run;

    data dupes_add;
	set dupes_add;
	by cfn fk_priority impute_priority emp_lbd pay_lbd;
	if last.cfn=1;
    run;
    
    data lbd&y;
	length was_dupe 3;
	set lbd&y(in=a) dupes_add(in=b);
	if b=1 then was_dupe=1;
    run;

    proc sort data=lbd&y;
	by cfn;
    run;
    
    proc sort data=all_estabs&y;
	by cfn;
    run;
    
    data all_estabs&y(drop=sic1) no_lbd_match&y(drop=sic1);
	length sic_lbd $ 4 sic $ 4;
	merge all_estabs&y(in=a rename=(firmid=firmid_cen)) lbd&y(in=b);
	by cfn;
       
        *Make industry SIC and NAICS variable - first trust census then LBD;
	sic_lbd=substr(sic1,1,4);
	sic=sic_census;
	if sic=' ' & sic_lbd~=' ' then sic=sic_lbd;
	if sic=' ' & bestsic~=' ' then sic=substr(bestsic,1,4);

        *Make firmid variable;
	firmid=firmid_cen;
	if firmid=' ' then firmid=firmid_lbd;
	if a=0 & b=1 then source="LBD";
        
        *if ASM emp missing, use LBD emp;
	emp=emp_cen;
        if emp=. | emp<=0 then emp=emp_lbd;

	* label where observations come from;
        if a=1 & b=0 then LBD_miss=1; 
        if a=0 & b=1 then CEN_miss=1; 
   
       *Label variables;
       label   LBD_miss="Record missing from LBD"
	       CEN_miss="Record missing from Census"
	       emp_lbd="Emp from LBD"
               emp_cen="Emp from Census" 
               EIN="EIN from Census"
               ALPHA="Alpha from Census";
       
      
       *Output datasets;
	if a=1 then output all_estabs&y;
	if a=1 & b=0 then output no_lbd_match&y;
    run;

%mend;

*Run the macro to add LBD information ;
****************************************;
%macro loop2();
  %do y=1977 %to 2012 %by 5;
     %firm_lbd(&y);
  %end;
%mend;
%loop2();
****************************************;
  
*Combine all years;
****************************************;
data ecroot.all_ec;
    set all_estabs1977 all_estabs1982 all_estabs1987 all_estabs1992 all_estabs1997 all_estabs2002 all_estabs2007 all_estabs2012;
run;
****************************************;



 


            
            
